# NOTE(review): presumably the name of the document field that the filters below
# operate on — confirm against the code that loads this config.
input_field: text
filters:
  # The filters below define a chain of heuristic filters to be applied to each document in a corpus.
  # This particular cascade of filters is intended to filter Python code data.
  # The filter listed at the top will be applied first, and the following filters will be applied in
  # the order they appear in this file. Each filter can be removed and re-ordered as desired.
  # The first filter below (PythonCommentToCodeFilter) is Python-specific; change it to match
  # the programming language of your data.
  # Code filter implementations are in nemo_curator/filters/code.py
  - name: nemo_curator.filters.code.PythonCommentToCodeFilter
    params:
      # Lower and upper bounds on a document's comment-to-code ratio.
      # NOTE(review): presumably documents whose ratio falls outside these bounds are
      # discarded, and the bounds are fractions rather than percentages — confirm against
      # the filter implementation.
      min_comment_to_code_ratio: 0.001
      max_comment_to_code_ratio: 0.85
  - name: nemo_curator.filters.code.NumberOfLinesOfCodeFilter
    params:
      # Lower and upper bounds on a document's line count.
      # NOTE(review): presumably documents with fewer than min_lines or more than
      # max_lines lines are discarded; whether the bounds are inclusive is not visible
      # here — confirm against the filter implementation.
      min_lines: 5
      max_lines: 20000
  - name: nemo_curator.filters.code.TokenizerFertilityFilter
    params:
      # Placeholder value: replace it with the path to a tokenizer model before running.
      path_to_tokenizer: <Specify a path to a tokenizer model>
      # NOTE(review): presumably the minimum characters-per-token ratio a document must
      # reach to be kept — confirm against the filter implementation.
      min_char_to_token_ratio: 2
